# Setup environment ----
library(parallel)
library(data.table)
source("R/functions.R")
# Folder that receives the objects saved at the end of this script.
results_path <- "results/replication/"
# Restore the prepared workspace; the code below expects it to provide the
# data.table `senate_data`.
load(file = "data/data-2021-02-22.RData")
# Keep an unfiltered copy: the lagged-outcome dataset is derived from it after
# `senate_data` itself has been replaced by the filtered subset.
senate_data_lag <- copy(senate_data)

# Senate data ----
# Analysis sample: rows with jungle, weird_race and special all equal to 0
# and with no missing ideology, vote or spending-advantage values.
senate_data <- senate_data[
  special == 0 & weird_race == 0 & jungle == 0 &
    !(is.na(ideological_distance) | is.na(dem_vote) | is.na(rep_vote) |
      is.na(real_dem_expenditure_advantage_w_outside))]
# 5th and 95th percentiles of the spending advantage within the analysis
# sample; they delimit the "tails" used when building the design matrices.
sen_tail_lims <- quantile(
  senate_data[, real_dem_expenditure_advantage_w_outside],
  probs = c(0.05, 0.95), na.rm = TRUE)
# Design matrix for the main Senate models. Column order:
#   dem_spend_adv (spending advantage, zeroed outside the middle 90%),
#   the control variables, the election-year dummies, and finally the tail
#   indicators and their interactions with the spending advantage.
# Built inside local() so the helper vectors do not leak into the workspace.
sen_X <- local({
  dsa <- senate_data[, real_dem_expenditure_advantage_w_outside]
  # 0/1 markers for where each observation falls relative to sen_tail_lims.
  below_p05 <- as.numeric(dsa < sen_tail_lims[1])
  above_p95 <- as.numeric(dsa > sen_tail_lims[2])
  within_90 <- as.numeric(dsa >= sen_tail_lims[1] & dsa <= sen_tail_lims[2])
  year_dummies <- c(
    "y80", "y82", "y84", "y86", "y88",
    "y90", "y92", "y94", "y96", "y98",
    "y00", "y02", "y04", "y06", "y08",
    "y10", "y12", "y14", "y16", "y18")
  controls <- senate_data[, .(
    log_total_spending = log10(real_total_spending_w_outside),
    dem_presvote_advantage,
    adj_dem_presvote_advantage,
    log_dem_presvote = log10(dem_presvote),
    log_rep_presvote = log10(rep_presvote),
    ideological_distance,
    dem_cfscore, rep_cfscore, open_seat, dem_incumbent,
    log_voting_eligible_population = log10(voting_eligible_population))]
  data.matrix(cbind(
    data.table(dem_spend_adv = dsa * within_90),
    controls,
    senate_data[, year_dummies, with = FALSE],
    data.table(
      bottom_tail = below_p05,
      top_tail = above_p95,
      bottom_tail_DSA = dsa * below_p05,
      top_tail_DSA = dsa * above_p95,
      middle_90 = within_90)))
})
# Outcome as a one-column matrix named dem_vote_share.
sen_y <- cbind(dem_vote_share = senate_data[, dem_vote_share])

#   data for randomForest ----
# Covariate matrix in which the spending advantage enters untransformed and
# no tail indicator columns are added; controls first, then year dummies.
sen_X_rf <- local({
  year_dummies <- c(
    "y80", "y82", "y84", "y86", "y88",
    "y90", "y92", "y94", "y96", "y98",
    "y00", "y02", "y04", "y06", "y08",
    "y10", "y12", "y14", "y16", "y18")
  covariates <- senate_data[, .(
    dem_spend_adv = real_dem_expenditure_advantage_w_outside,
    log_total_spending = log10(real_total_spending_w_outside),
    dem_presvote_advantage,
    adj_dem_presvote_advantage,
    log_dem_presvote = log10(dem_presvote),
    log_rep_presvote = log10(rep_presvote),
    ideological_distance,
    dem_cfscore, rep_cfscore, open_seat, dem_incumbent,
    log_voting_eligible_population = log10(voting_eligible_population))]
  data.matrix(cbind(covariates, senate_data[, year_dummies, with = FALSE]))
})
#   dichotomous outcome variable ----
# 1 when dem_vote_share exceeds one half, 0 otherwise (unnamed single column).
sen_y_01 <- matrix(as.numeric(senate_data[, dem_vote_share] > 0.5), ncol = 1)

#   dataset including lagged outcome ----
# Attach to every non-special race the two-party Democratic vote share
# (lag_outcome) and spending advantage (lag_DSA) of the race held six years
# earlier with the same stabb and class, then keep only the rows that pass the
# main-sample filters, are not open seats, and have a lagged outcome.
senate_data_lag <- local({
  regular_races <- senate_data_lag[special == 0]
  # Shifting year forward by six makes each race match the one that follows it.
  previous_race <- regular_races[, .(
    year = year + 6,
    stabb,
    class,
    lag_outcome = dem_vote / (rep_vote + dem_vote),
    lag_DSA = real_dem_expenditure_advantage_w_outside)]
  with_lags <- merge(regular_races, previous_race,
    by = c("year", "stabb", "class"), all.x = TRUE)
  with_lags[
    special == 0 & weird_race == 0 & jungle == 0 & open_seat == 0 &
      !(is.na(ideological_distance) | is.na(dem_vote) | is.na(rep_vote) |
        is.na(real_dem_expenditure_advantage_w_outside) |
        is.na(lag_outcome))]
})
# Design matrix for the lagged-outcome sample. Same layout as the main design
# matrix except that open_seat and the y80-y84 dummies are omitted and
# lag_outcome is appended as the last column. The tail limits are the ones in
# sen_tail_lims, i.e. they are not recomputed on this subsample.
sen_X_lag <- local({
  dsa <- senate_data_lag[, real_dem_expenditure_advantage_w_outside]
  # 0/1 markers for where each observation falls relative to sen_tail_lims.
  below_p05 <- as.numeric(dsa < sen_tail_lims[1])
  above_p95 <- as.numeric(dsa > sen_tail_lims[2])
  within_90 <- as.numeric(dsa >= sen_tail_lims[1] & dsa <= sen_tail_lims[2])
  year_dummies <- c(
    "y86", "y88",
    "y90", "y92", "y94", "y96", "y98",
    "y00", "y02", "y04", "y06", "y08",
    "y10", "y12", "y14", "y16", "y18")
  controls <- senate_data_lag[, .(
    log_total_spending = log10(real_total_spending_w_outside),
    dem_presvote_advantage,
    adj_dem_presvote_advantage,
    log_dem_presvote = log10(dem_presvote),
    log_rep_presvote = log10(rep_presvote),
    ideological_distance,
    dem_cfscore, rep_cfscore,
    dem_incumbent,
    log_voting_eligible_population = log10(voting_eligible_population))]
  data.matrix(cbind(
    data.table(dem_spend_adv = dsa * within_90),
    controls,
    senate_data_lag[, year_dummies, with = FALSE],
    data.table(
      bottom_tail = below_p05,
      top_tail = above_p95,
      bottom_tail_DSA = dsa * below_p05,
      top_tail_DSA = dsa * above_p95,
      middle_90 = within_90,
      lag_outcome = senate_data_lag[, lag_outcome])))
})
# Outcome as a one-column matrix named dem_vote_share.
sen_y_lag <- cbind(dem_vote_share = senate_data_lag[, dem_vote_share])

#   for numerical marginal effects ----
# The variable of interest is the first column of sen_X (dem_spend_adv).
xS <- sen_X[, 1]
# setstep() is not defined in this script (presumably it comes from
# R/functions.R); it appears to pick the finite-difference step for xS --
# TODO confirm against its definition.
dxS <- setstep(xS)
# Two perturbed versions of the design matrix, one built with +dxS and one
# with -dxS. NOTE(review): presumably the helper shifts the spending variable
# and the columns derived from it, with "sen" selecting the Senate layout --
# verify in the helper's source.
Xp1S <- make_plus_dx_dataset(sen_X, dxS, "sen")
Xm1S <- make_plus_dx_dataset(sen_X, -dxS, "sen")

#   for CV ----
# The seed is fixed immediately before the folds are created; presumably
# make_train_and_test_sets() draws them at random, so keep these two lines
# together to leave the split reproducible.
set.seed(1702567564)
# Arguments are the number of observations and 5 (presumably the number of
# folds -- TODO confirm).
sen_folds <- make_train_and_test_sets(length(sen_y), 5)
# save ----
# save() does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here with "cannot open file"
# after all the preparation above has already run. An existing folder is left
# untouched (showWarnings = FALSE silences the "already exists" warning).
dir.create(results_path, recursive = TRUE, showWarnings = FALSE)
# Persist every object the downstream analysis scripts load from this file.
save(
  sen_tail_lims,
  senate_data, sen_X, sen_y,
  sen_X_rf,
  sen_y_01,
  senate_data_lag, sen_X_lag, sen_y_lag,
  xS, dxS, Xp1S, Xm1S,
  sen_folds,
  file = paste0(results_path, "senate_analysis_data.RData")
)
